# -*- coding: utf-8 -*-
"""
Created on Wed Apr 19 18:27:30 2017

@author: BaiYunfei
"""
from collections import Counter

import pandas as pd
from sklearn import cluster


def cal_sim(l):
    """Score how alike the items of ``l`` are, based on their characters.

    Every item is converted with ``str`` and reduced to a mapping of
    character -> number of occurrences.  The mappings become the rows of a
    DataFrame (one column per character, absent characters filled with 0).
    A DBSCAN model with ``min_samples=1`` is fitted on that table and its
    cluster labels are stored in an extra ``label`` column, so the labels
    take part in the correlation computed afterwards.

    Returns the mean of the column means of the correlation matrix computed
    between the rows of the table.
    """
    char_counts = []
    for item in l:
        counts = {}
        for ch in str(item):
            counts[ch] = counts.get(ch, 0) + 1
        char_counts.append(counts)

    frame = pd.DataFrame(char_counts).fillna(0)

    # Deduplicate (original author's note): cluster the rows and keep the
    # cluster id of each row as an additional column.
    clustering = cluster.DBSCAN(min_samples=1)
    clustering.fit(frame)
    frame['label'] = clustering.labels_

    # Transposing turns each item into a column, so corr() compares items.
    item_corr = frame.T.corr()
    return item_corr.mean().mean()

def rm_dupl(l):
    """Return the number of distinct items in ``l`` divided by the highest
    occurrence count of any single item.

    Parameters:
        l: iterable of hashable items.

    Returns:
        ``distinct_count / max_count``.  For an empty input the divisor
        falls back to 1, so the result is 0.
    """
    counts = Counter(l)
    # Every stored count is >= 1; the fallback of 1 only matters for empty
    # input and keeps the division well-defined.
    max_count = max(counts.values()) if counts else 1
    return len(counts) / max_count